---
title: "ML Methods"
author:
  - name: Mahira Ayub
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
  - name: Ava Godsy
    affiliations:
      - ref: bu
  - name: Joshua Lawrence
    affiliations:
      - ref: bu
date: today
format:
  html:
    theme: minty
bibliography: references.bib
csl: csl/econometrica.csl
toc: true
---
In [ ]:
import os

# Point pyspark at the JVM it should use (update to your exact path).
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-17'

import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook"

# Stop any existing Spark session before starting a new one.
# Fetch the session once (the original called getActiveSession() twice and
# relied on `and` for the side effect, which obscures intent).
_active_session = SparkSession.getActiveSession()
if _active_session is not None:
    _active_session.stop()

# Initialize a Spark session with an explicit local master.
spark = SparkSession.builder \
    .appName("LightcastData") \
    .master("local[*]") \
    .config("spark.driver.host", "localhost") \
    .getOrCreate()

# Load the Lightcast postings CSV.
# multiLine + escape are needed because quoted fields (e.g. job BODY text)
# contain embedded newlines and escaped double quotes.
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("multiLine", "true") \
    .option("escape", "\"") \
    .csv("data/lightcast_job_postings.csv")

# Diagnostic checks (keep commented out in the final document):
# print("---This is Diagnostic check, No need to print it in the final doc---")
# df.printSchema()
# df.show(5)
---This is Diagnostic check, No need to print it in the final doc---
root
|-- ID: string (nullable = true)
|-- LAST_UPDATED_DATE: string (nullable = true)
|-- LAST_UPDATED_TIMESTAMP: timestamp (nullable = true)
|-- DUPLICATES: integer (nullable = true)
|-- POSTED: string (nullable = true)
|-- EXPIRED: string (nullable = true)
|-- DURATION: integer (nullable = true)
|-- SOURCE_TYPES: string (nullable = true)
|-- SOURCES: string (nullable = true)
|-- URL: string (nullable = true)
|-- ACTIVE_URLS: string (nullable = true)
|-- ACTIVE_SOURCES_INFO: string (nullable = true)
|-- TITLE_RAW: string (nullable = true)
|-- BODY: string (nullable = true)
|-- MODELED_EXPIRED: string (nullable = true)
|-- MODELED_DURATION: integer (nullable = true)
|-- COMPANY: integer (nullable = true)
|-- COMPANY_NAME: string (nullable = true)
|-- COMPANY_RAW: string (nullable = true)
|-- COMPANY_IS_STAFFING: boolean (nullable = true)
|-- EDUCATION_LEVELS: string (nullable = true)
|-- EDUCATION_LEVELS_NAME: string (nullable = true)
|-- MIN_EDULEVELS: integer (nullable = true)
|-- MIN_EDULEVELS_NAME: string (nullable = true)
|-- MAX_EDULEVELS: integer (nullable = true)
|-- MAX_EDULEVELS_NAME: string (nullable = true)
|-- EMPLOYMENT_TYPE: integer (nullable = true)
|-- EMPLOYMENT_TYPE_NAME: string (nullable = true)
|-- MIN_YEARS_EXPERIENCE: integer (nullable = true)
|-- MAX_YEARS_EXPERIENCE: integer (nullable = true)
|-- IS_INTERNSHIP: boolean (nullable = true)
|-- SALARY: integer (nullable = true)
|-- REMOTE_TYPE: integer (nullable = true)
|-- REMOTE_TYPE_NAME: string (nullable = true)
|-- ORIGINAL_PAY_PERIOD: string (nullable = true)
|-- SALARY_TO: integer (nullable = true)
|-- SALARY_FROM: integer (nullable = true)
|-- LOCATION: string (nullable = true)
|-- CITY: string (nullable = true)
|-- CITY_NAME: string (nullable = true)
|-- COUNTY: integer (nullable = true)
|-- COUNTY_NAME: string (nullable = true)
|-- MSA: integer (nullable = true)
|-- MSA_NAME: string (nullable = true)
|-- STATE: integer (nullable = true)
|-- STATE_NAME: string (nullable = true)
|-- COUNTY_OUTGOING: integer (nullable = true)
|-- COUNTY_NAME_OUTGOING: string (nullable = true)
|-- COUNTY_INCOMING: integer (nullable = true)
|-- COUNTY_NAME_INCOMING: string (nullable = true)
|-- MSA_OUTGOING: integer (nullable = true)
|-- MSA_NAME_OUTGOING: string (nullable = true)
|-- MSA_INCOMING: integer (nullable = true)
|-- MSA_NAME_INCOMING: string (nullable = true)
|-- NAICS2: integer (nullable = true)
|-- NAICS2_NAME: string (nullable = true)
|-- NAICS3: integer (nullable = true)
|-- NAICS3_NAME: string (nullable = true)
|-- NAICS4: integer (nullable = true)
|-- NAICS4_NAME: string (nullable = true)
|-- NAICS5: integer (nullable = true)
|-- NAICS5_NAME: string (nullable = true)
|-- NAICS6: integer (nullable = true)
|-- NAICS6_NAME: string (nullable = true)
|-- TITLE: string (nullable = true)
|-- TITLE_NAME: string (nullable = true)
|-- TITLE_CLEAN: string (nullable = true)
|-- SKILLS: string (nullable = true)
|-- SKILLS_NAME: string (nullable = true)
|-- SPECIALIZED_SKILLS: string (nullable = true)
|-- SPECIALIZED_SKILLS_NAME: string (nullable = true)
|-- CERTIFICATIONS: string (nullable = true)
|-- CERTIFICATIONS_NAME: string (nullable = true)
|-- COMMON_SKILLS: string (nullable = true)
|-- COMMON_SKILLS_NAME: string (nullable = true)
|-- SOFTWARE_SKILLS: string (nullable = true)
|-- SOFTWARE_SKILLS_NAME: string (nullable = true)
|-- ONET: string (nullable = true)
|-- ONET_NAME: string (nullable = true)
|-- ONET_2019: string (nullable = true)
|-- ONET_2019_NAME: string (nullable = true)
|-- CIP6: string (nullable = true)
|-- CIP6_NAME: string (nullable = true)
|-- CIP4: string (nullable = true)
|-- CIP4_NAME: string (nullable = true)
|-- CIP2: string (nullable = true)
|-- CIP2_NAME: string (nullable = true)
|-- SOC_2021_2: string (nullable = true)
|-- SOC_2021_2_NAME: string (nullable = true)
|-- SOC_2021_3: string (nullable = true)
|-- SOC_2021_3_NAME: string (nullable = true)
|-- SOC_2021_4: string (nullable = true)
|-- SOC_2021_4_NAME: string (nullable = true)
|-- SOC_2021_5: string (nullable = true)
|-- SOC_2021_5_NAME: string (nullable = true)
|-- LOT_CAREER_AREA: integer (nullable = true)
|-- LOT_CAREER_AREA_NAME: string (nullable = true)
|-- LOT_OCCUPATION: integer (nullable = true)
|-- LOT_OCCUPATION_NAME: string (nullable = true)
|-- LOT_SPECIALIZED_OCCUPATION: integer (nullable = true)
|-- LOT_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
|-- LOT_OCCUPATION_GROUP: integer (nullable = true)
|-- LOT_OCCUPATION_GROUP_NAME: string (nullable = true)
|-- LOT_V6_SPECIALIZED_OCCUPATION: integer (nullable = true)
|-- LOT_V6_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
|-- LOT_V6_OCCUPATION: integer (nullable = true)
|-- LOT_V6_OCCUPATION_NAME: string (nullable = true)
|-- LOT_V6_OCCUPATION_GROUP: integer (nullable = true)
|-- LOT_V6_OCCUPATION_GROUP_NAME: string (nullable = true)
|-- LOT_V6_CAREER_AREA: integer (nullable = true)
|-- LOT_V6_CAREER_AREA_NAME: string (nullable = true)
|-- SOC_2: string (nullable = true)
|-- SOC_2_NAME: string (nullable = true)
|-- SOC_3: string (nullable = true)
|-- SOC_3_NAME: string (nullable = true)
|-- SOC_4: string (nullable = true)
|-- SOC_4_NAME: string (nullable = true)
|-- SOC_5: string (nullable = true)
|-- SOC_5_NAME: string (nullable = true)
|-- LIGHTCAST_SECTORS: string (nullable = true)
|-- LIGHTCAST_SECTORS_NAME: string (nullable = true)
|-- NAICS_2022_2: integer (nullable = true)
|-- NAICS_2022_2_NAME: string (nullable = true)
|-- NAICS_2022_3: integer (nullable = true)
|-- NAICS_2022_3_NAME: string (nullable = true)
|-- NAICS_2022_4: integer (nullable = true)
|-- NAICS_2022_4_NAME: string (nullable = true)
|-- NAICS_2022_5: integer (nullable = true)
|-- NAICS_2022_5_NAME: string (nullable = true)
|-- NAICS_2022_6: integer (nullable = true)
|-- NAICS_2022_6_NAME: string (nullable = true)
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
| ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES| POSTED| EXPIRED|DURATION| SOURCE_TYPES| SOURCES| URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO| TITLE_RAW| BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY| COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM| LOCATION| CITY| CITY_NAME|COUNTY| COUNTY_NAME| MSA| MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING| MSA_NAME_OUTGOING|MSA_INCOMING| MSA_NAME_INCOMING|NAICS2| NAICS2_NAME|NAICS3| NAICS3_NAME|NAICS4| NAICS4_NAME|NAICS5| NAICS5_NAME|NAICS6| NAICS6_NAME| TITLE| TITLE_NAME| TITLE_CLEAN| SKILLS| SKILLS_NAME| SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME| CERTIFICATIONS| CERTIFICATIONS_NAME| COMMON_SKILLS| COMMON_SKILLS_NAME| SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME| ONET| ONET_NAME| ONET_2019| ONET_2019_NAME| CIP6| CIP6_NAME| CIP4| CIP4_NAME| CIP2| CIP2_NAME|SOC_2021_2| SOC_2021_2_NAME|SOC_2021_3| SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME| SOC_2| SOC_2_NAME| SOC_3| SOC_3_NAME| SOC_4| SOC_4_NAME| SOC_5| SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2| NAICS_2022_2_NAME|NAICS_2022_3| NAICS_2022_3_NAME|NAICS_2022_4| NAICS_2022_4_NAME|NAICS_2022_5| NAICS_2022_5_NAME|NAICS_2022_6| NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...| 9/6/2024| 2024-09-06 13:32:...| 0|6/2/2024| 6/8/2024| 6| [\n "Company"\n]|[\n "brassring.c...|[\n "https://sjo...| []| null|Enterprise Analys...|31-May-2024\n\nEn...| 6/8/2024| 6| 894731| Murphy USA| Murphy USA| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| null| null| 1|Full-time (> 32 h...| 2| 2| false| null| 0| [None]| null| null| null|{\n "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR| 5139| Union, AR|20980| El Dorado, AR| 5| Arkansas| 5139| Union, AR| 5139| Union, AR| 20980| El Dorado, AR| 20980| El Dorado, AR| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n "KS126DB6T06...|[\n "Merchandisi...|[\n "KS126DB6T06...| [\n "Merchandisi...| []| []|[\n "KS126706DPF...|[\n "Mathematics...|[\n "KS440W865GC...|[\n "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "45.0601",\n...|[\n "Economics, ...|[\n "45.06",\n ...|[\n "Economics",...|[\n "45",\n "27...|[\n "Social Scie...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 7\n]| [\n "Artificial ...| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...| 441330|Automotive Parts ...|
|0cb072af26757b6c4...| 8/2/2024| 2024-08-02 10:08:...| 0|6/2/2024| 8/1/2024| null| [\n "Job Board"\n]| [\n "maine.gov"\n]|[\n "https://job...| []| null|Oracle Consultant...|Oracle Consultant...| 8/1/2024| null| 133098|Smx Corporation L...| SMX| true| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| null| null| 1|Full-time (> 32 h...| 3| 3| false| null| 1| Remote| null| null| null|{\n "lat": 44.31...| QXVndXN0YSwgTUU=| Augusta, ME| 23011| Kennebec, ME|12300|Augusta-Watervill...| 23| Maine| 23011| Kennebec, ME| 23011| Kennebec, ME| 12300|Augusta-Watervill...| 12300|Augusta-Watervill...| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n "KS122626T55...|[\n "Procurement...|[\n "KS122626T55...| [\n "Procurement...| []| []| []| []|[\n "BGSBF3F508F...|[\n "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| null| null| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...| 561320|Temporary Help Se...|
|85318b12b3331fa49...| 9/6/2024| 2024-09-06 13:32:...| 1|6/2/2024| 7/7/2024| 35| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| null| Data Analyst|Taking care of pe...| 6/10/2024| 8|39063746| Sedgwick| Sedgwick| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| null| null| 1|Full-time (> 32 h...| 5| null| false| null| 0| [None]| null| null| null|{\n "lat": 32.77...| RGFsbGFzLCBUWA==| Dallas, TX| 48113| Dallas, TX|19100|Dallas-Fort Worth...| 48| Texas| 48113| Dallas, TX| 48113| Dallas, TX| 19100|Dallas-Fort Worth...| 19100|Dallas-Fort Worth...| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...|524291| Claims Adjusting|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS1218W78FG...|[\n "Management"...|[\n "ESF3939CE1F...| [\n "Exception R...|[\n "KS683TN76T7...|[\n "Security Cl...|[\n "KS1218W78FG...|[\n "Management"...|[\n "KS126HY6YLT...|[\n "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| null| null| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...| 524291| Claims Adjusting|
|1b5c3941e54a1889e...| 9/6/2024| 2024-09-06 13:32:...| 1|6/2/2024|7/20/2024| 48| [\n "Job Board"\n]|[\n "disabledper...|[\n "https://www...| []| null|Sr. Lead Data Mgm...|About this role:\...| 6/12/2024| 10|37615159| Wells Fargo|Wells Fargo| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| null| null| 1|Full-time (> 32 h...| 3| null| false| null| 0| [None]| null| null| null|{\n "lat": 33.44...| UGhvZW5peCwgQVo=| Phoenix, AZ| 4013| Maricopa, AZ|38060|Phoenix-Mesa-Chan...| 4| Arizona| 4013| Maricopa, AZ| 4013| Maricopa, AZ| 38060|Phoenix-Mesa-Chan...| 38060|Phoenix-Mesa-Chan...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking|522110| Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n "KS123QX62QY...|[\n "Exit Strate...|[\n "KS123QX62QY...| [\n "Exit Strate...| []| []|[\n "KS7G6NP6R6L...|[\n "Reliability...|[\n "KS4409D76NW...|[\n "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 6\n]| [\n "Data Privac...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking| 522110| Commercial Banking|
|cb5ca25f02bdf25c1...| 6/19/2024| 2024-06-19 00:00:00| 0|6/2/2024|6/17/2024| 15|[\n "FreeJobBoar...|[\n "craigslist....|[\n "https://mod...| []| null|Comisiones de $10...|Comisiones de $10...| 6/17/2024| 15| 0| Unclassified| LH/GM| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| null| null| 3|Part-time / full-...| null| null| false| 92500| 0| [None]| year| 150000| 35000|{\n "lat": 37.63...| TW9kZXN0bywgQ0E=| Modesto, CA| 6099|Stanislaus, CA|33700| Modesto, CA| 6|California| 6099| Stanislaus, CA| 6099| Stanislaus, CA| 33700| Modesto, CA| 33700| Modesto, CA| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000| Unclassified|comisiones de por...| []| []| []| []| []| []| []| []| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| null| null| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...| 999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows
In [2]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml.stat import Correlation
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Keep only rows with a positive, non-null salary and the two categorical
# columns the models depend on.
df_clean = df.filter(
    F.col("SALARY").isNotNull()
    & (F.col("SALARY") > 0)
    & F.col("STATE_NAME").isNotNull()
    & F.col("TITLE_NAME").isNotNull()
)

print(f"Original dataset size: {df.count():,}")
print(f"Cleaned dataset size: {df_clean.count():,}")

# Summary statistics of the salary column (single aggregation pass).
salary_stats = df_clean.agg(
    F.mean("SALARY").alias("mean_salary"),
    F.expr("percentile_approx(SALARY, 0.5)").alias("median_salary"),
    F.stddev("SALARY").alias("std_salary"),
    F.min("SALARY").alias("min_salary"),
    F.max("SALARY").alias("max_salary"),
).collect()[0]

print(f"\nSalary Statistics:")
print(f" Mean: ${salary_stats['mean_salary']:,.2f}")
print(f" Median: ${salary_stats['median_salary']:,.2f}")
print(f" Std Dev: ${salary_stats['std_salary']:,.2f}")
print(f" Min: ${salary_stats['min_salary']:,.2f}")
print(f" Max: ${salary_stats['max_salary']:,.2f}")

# Binary classification target: 1 when the posting pays above the mean
# salary, 0 otherwise. (SALARY is guaranteed non-null by the filter above.)
avg_salary = salary_stats['mean_salary']
is_above_mean = F.col("SALARY") > avg_salary
df_clean = df_clean.withColumn(
    "ABOVE_AVERAGE_SALARY",
    F.when(is_above_mean, 1).otherwise(0),
)
Original dataset size: 72,498 Cleaned dataset size: 30,808 Salary Statistics: Mean: $117,953.76 Median: $116,300.00 Std Dev: $45,133.88 Min: $15,860.00 Max: $500,000.00
In [3]:
# Derive PRIMARY_SKILL from SKILLS_NAME, which holds a JSON-like array string
# (e.g. '[\n "Power BI",\n ...]'). Strip brackets/quotes/newlines and take the
# first entry; null values become "No Skills Listed".
df_clean = df_clean.withColumn(
    "PRIMARY_SKILL",
    F.when(
        F.col("SKILLS_NAME").isNotNull(),
        F.split(F.regexp_replace(F.col("SKILLS_NAME"), r'[\[\]"\n]', ''), ",").getItem(0)
    ).otherwise("No Skills Listed")
)


def _fill_and_trim(col_name, default):
    """Return a trimmed string column, with null/blank values replaced by `default`.

    Factors out the null/blank-handling expression that was previously
    duplicated verbatim for each categorical column.
    """
    trimmed = F.trim(F.col(col_name))
    return F.when(
        F.col(col_name).isNull() | (trimmed == ""), default
    ).otherwise(trimmed)


# Select model features and targets, normalizing blank/null categories.
features_df = (
    df_clean.select(
        "STATE_NAME",
        "TITLE_NAME",
        "PRIMARY_SKILL",
        "SALARY",
        "ABOVE_AVERAGE_SALARY",
    )
    .withColumn("STATE_NAME", _fill_and_trim("STATE_NAME", "Unknown"))
    .withColumn("TITLE_NAME", _fill_and_trim("TITLE_NAME", "Unknown"))
    .withColumn("PRIMARY_SKILL", _fill_and_trim("PRIMARY_SKILL", "No Skills Listed"))
)

# Show feature distribution
print("\nTop 10 States by Job Postings:")
features_df.groupBy("STATE_NAME").count().orderBy(F.desc("count")).show(10, truncate=False)
print("\nTop 10 Job Titles by Frequency:")
features_df.groupBy("TITLE_NAME").count().orderBy(F.desc("count")).show(10, truncate=False)
print("\nTop 10 Skills by Frequency:")
features_df.groupBy("PRIMARY_SKILL").count().orderBy(F.desc("count")).show(10, truncate=False)
Top 10 States by Job Postings: +--------------+-----+ |STATE_NAME |count| +--------------+-----+ |California |3984 | |Texas |2544 | |New York |1996 | |Florida |1504 | |Virginia |1347 | |Illinois |1271 | |North Carolina|962 | |Colorado |943 | |Washington |933 | |Ohio |927 | +--------------+-----+ only showing top 10 rows Top 10 Job Titles by Frequency: +-------------------------------+-----+ |TITLE_NAME |count| +-------------------------------+-----+ |Data Analysts |3632 | |Business Intelligence Analysts |991 | |Unclassified |976 | |Oracle Cloud HCM Consultants |714 | |Enterprise Architects |699 | |Data Analytics Engineers |349 | |Data and Reporting Analysts |319 | |Data Governance Analysts |288 | |Principal Architects |227 | |Enterprise Solutions Architects|225 | +-------------------------------+-----+ only showing top 10 rows Top 10 Skills by Frequency: +--------------------+-----+ |PRIMARY_SKILL |count| +--------------------+-----+ |Power BI |2131 | |Management |1489 | |Customer Service |1467 | |Research |1381 | |Business Objectives |1257 | |Detail Oriented |985 | |Relational Databases|834 | |SAP S/4HANA |820 | |Salesforce |670 | |Presentations |643 | +--------------------+-----+ only showing top 10 rows
In [4]:
# Categorical encoding spec: (input column, index column, one-hot column).
_categoricals = [
    ("STATE_NAME", "STATE_INDEX", "STATE_VEC"),
    ("TITLE_NAME", "TITLE_INDEX", "TITLE_VEC"),
    ("PRIMARY_SKILL", "SKILL_INDEX", "SKILL_VEC"),
]

# String indexing for the categorical variables; handleInvalid="keep" routes
# unseen test-time categories to an extra bucket instead of failing.
state_indexer, title_indexer, skill_indexer = [
    StringIndexer(inputCol=col, outputCol=idx, handleInvalid="keep")
    for col, idx, _vec in _categoricals
]

# One-hot encoding of the indexed categories.
state_encoder, title_encoder, skill_encoder = [
    OneHotEncoder(inputCol=idx, outputCol=vec)
    for _col, idx, vec in _categoricals
]

# Assemble the encoded vectors into a single feature column.
assembler = VectorAssembler(
    inputCols=[vec for _col, _idx, vec in _categoricals],
    outputCol="features",
)

# 80/20 train/test split with a fixed seed for reproducibility.
train_data, test_data = features_df.randomSplit([0.8, 0.2], seed=42)
print(f"Training set size: {train_data.count():,}")
print(f"Test set size: {test_data.count():,}")
Training set size: 24,676 Test set size: 6,132
In [5]:
# Linear regression: predict SALARY from the one-hot state/title/skill vectors.
lr = LinearRegression(featuresCol="features", labelCol="SALARY", maxIter=100, regParam=0.1)

# Full pipeline: index -> encode -> assemble -> regress.
lr_pipeline = Pipeline(stages=[
    state_indexer, title_indexer, skill_indexer,
    state_encoder, title_encoder, skill_encoder,
    assembler, lr,
])

# Fit on the training split, then score the held-out split.
lr_model = lr_pipeline.fit(train_data)
lr_predictions = lr_model.transform(test_data)

# One RegressionEvaluator per metric, built from a single comprehension.
_metric_evaluators = {
    metric: RegressionEvaluator(
        labelCol="SALARY", predictionCol="prediction", metricName=metric
    )
    for metric in ("rmse", "r2", "mae")
}
evaluator_rmse = _metric_evaluators["rmse"]
evaluator_r2 = _metric_evaluators["r2"]
evaluator_mae = _metric_evaluators["mae"]

rmse = evaluator_rmse.evaluate(lr_predictions)
r2 = evaluator_r2.evaluate(lr_predictions)
mae = evaluator_mae.evaluate(lr_predictions)

print(f"\nLinear Regression Model Performance:")
print(f" RMSE: ${rmse:,.2f}")
print(f" R² Score: {r2:.4f}")
print(f" MAE: ${mae:,.2f}")

# Show sample predictions
print("\nSample Predictions:")
lr_predictions.select("STATE_NAME", "TITLE_NAME", "PRIMARY_SKILL", "SALARY", "prediction").show(10, truncate=50)
Linear Regression Model Performance: RMSE: $31,213.94 R² Score: 0.5332 MAE: $21,417.79 Sample Predictions: +----------+--------------------------------------------+-------------------+------+------------------+ |STATE_NAME| TITLE_NAME| PRIMARY_SKILL|SALARY| prediction| +----------+--------------------------------------------+-------------------+------+------------------+ | Alabama| Analytics Engineers| Reliability|134500|134287.38233297397| | Alabama| Business Intelligence Analysts|Business Objectives|105200| 108386.078482199| | Alabama| Business Intelligence Analysts| Data Warehousing|112097|107321.68487090743| | Alabama| Business Intelligence Analysts| Power BI| 59582| 98300.68285045939| | Alabama| Business Intelligence Analysts| Research|125000| 98235.926749922| | Alabama|Business Intelligence and Analytics Managers| Microsoft Access| 97055|105651.29289851415| | Alabama| Consultants| Communication|130500|124869.98049877661| | Alabama| Data Analysts| Auditing| 93600| 83656.68287552655| | Alabama| Data Analysts| Invoice Review| 80400| 80371.22837983494| | Alabama| Data Analysts| Kibana| 95000| 89909.6430482245| +----------+--------------------------------------------+-------------------+------+------------------+ only showing top 10 rows
In [6]:
# Logistic regression: classify postings as above/below the average salary.
log_reg = LogisticRegression(featuresCol="features", labelCol="ABOVE_AVERAGE_SALARY", maxIter=100)

# Same preprocessing stages as the regression pipeline, new final estimator.
log_pipeline = Pipeline(stages=[
    state_indexer, title_indexer, skill_indexer,
    state_encoder, title_encoder, skill_encoder,
    assembler, log_reg,
])

# Fit on the training split, then score the held-out split.
log_model = log_pipeline.fit(train_data)
log_predictions = log_model.transform(test_data)

# Area under the ROC curve on the test set.
auc_evaluator = BinaryClassificationEvaluator(labelCol="ABOVE_AVERAGE_SALARY", metricName="areaUnderROC")
auc = auc_evaluator.evaluate(log_predictions)

# Accuracy = fraction of test rows where the predicted class equals the label.
n_correct = log_predictions.filter(
    F.col("ABOVE_AVERAGE_SALARY") == F.col("prediction")
).count()
n_total = log_predictions.count()
accuracy = n_correct / n_total

print(f"\nLogistic Regression Model Performance:")
print(f" AUC-ROC: {auc:.4f}")
print(f" Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Confusion Matrix
print("\nConfusion Matrix:")
confusion_matrix = log_predictions.groupBy("ABOVE_AVERAGE_SALARY", "prediction").count()
confusion_matrix.orderBy("ABOVE_AVERAGE_SALARY", "prediction").show()
Logistic Regression Model Performance: AUC-ROC: 0.8779 Accuracy: 0.8081 (80.81%) Confusion Matrix: +--------------------+----------+-----+ |ABOVE_AVERAGE_SALARY|prediction|count| +--------------------+----------+-----+ | 0| 0.0| 2665| | 0| 1.0| 485| | 1| 0.0| 692| | 1| 1.0| 2290| +--------------------+----------+-----+
In [7]:
# The fitted LinearRegressionModel is the last stage of the pipeline.
trained_lr = lr_model.stages[-1]
coefficients, intercept = trained_lr.coefficients, trained_lr.intercept

# Summarize the learned parameters (one print, identical output).
summary_lines = (
    f"Model Intercept: ${intercept:,.2f}",
    f"Number of features: {len(coefficients)}",
    f"Sum of coefficients: {sum(coefficients):,.2f}",
)
print("\n".join(summary_lines))
Model Intercept: $117,965.27 Number of features: 4219 Sum of coefficients: -34,383,304.02
In [8]:
# go is otherwise only imported in a later cell (In [16]); without this a
# fresh, in-order run of the notebook raises NameError at add_trace below.
import plotly.graph_objects as go

# Sample up to 1,000 test-set predictions into pandas for plotting.
pred_data = lr_predictions.select(
    "SALARY", "prediction", "STATE_NAME", "TITLE_NAME"
).limit(1000).collect()
pred_sample = pd.DataFrame(
    pred_data, columns=["SALARY", "prediction", "STATE_NAME", "TITLE_NAME"]
)

# Scatter of actual vs predicted salary; points on the diagonal are perfect.
fig1 = px.scatter(
    pred_sample,
    x="SALARY",
    y="prediction",
    title="Actual vs Predicted Salary (Linear Regression)",
    labels={"SALARY": "Actual Salary ($)", "prediction": "Predicted Salary ($)"},
    opacity=0.6,
    hover_data=["STATE_NAME", "TITLE_NAME"],
    color_discrete_sequence=['#78C2AD']
)

# 45-degree reference line spanning the observed salary range.
lo = pred_sample["SALARY"].min()
hi = pred_sample["SALARY"].max()
fig1.add_trace(go.Scatter(
    x=[lo, hi],
    y=[lo, hi],
    mode='lines',
    name='Perfect Prediction',
    line=dict(color='red', dash='dash')
))

# Verdana styling: 14pt body/axis titles, 18pt chart title.
axis_font = dict(family="Verdana", size=14, color="black")
fig1.update_layout(
    font=dict(family="Verdana", size=14, color="black"),
    title=dict(font=dict(family="Verdana", size=18, color="black")),
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
)
fig1.show()
# Aggregate mean salary and posting count per state; keep the ten highest-paying.
state_salary_data = (
    df_clean.groupBy("STATE_NAME")
    .agg(
        F.mean("SALARY").alias("avg_salary"),
        F.count("SALARY").alias("job_count"),
    )
    .orderBy(F.desc("avg_salary"))
    .limit(10)
    .collect()
)
state_salary = pd.DataFrame(
    state_salary_data, columns=["STATE_NAME", "avg_salary", "job_count"]
)

# Bar chart of the top states, each bar annotated with its posting count.
fig2 = px.bar(
    state_salary,
    x="STATE_NAME",
    y="avg_salary",
    title="Top 10 States by Average Salary",
    labels={"STATE_NAME": "State", "avg_salary": "Average Salary ($)"},
    color="avg_salary",
    text="job_count",
    color_continuous_scale=px.colors.sequential.Mint,
)
fig2.update_traces(texttemplate='Jobs: %{text}', textposition='outside')

# Verdana styling: 14pt body/axis titles, 18pt chart title.
fig2.update_layout(
    font=dict(family="Verdana", size=14, color="black"),
    title=dict(font=dict(family="Verdana", size=18, color="black")),
    xaxis=dict(title_font=dict(family="Verdana", size=14, color="black")),
    yaxis=dict(title_font=dict(family="Verdana", size=14, color="black")),
)
fig2.show()
# Class balance of the test-set labels: above vs below average salary.
salary_class_data = log_predictions.groupBy("ABOVE_AVERAGE_SALARY").count().collect()
salary_class = pd.DataFrame(salary_class_data, columns=["ABOVE_AVERAGE_SALARY", "count"])
salary_class["Category"] = salary_class["ABOVE_AVERAGE_SALARY"].map(
    {0: "Below Average", 1: "Above Average"}
)

fig3 = px.pie(
    salary_class,
    values="count",
    names="Category",
    title="Distribution of Above/Below Average Salaries",
    color_discrete_sequence=["#78C2AD", "#F3969A"],
)

# Verdana styling. A pie chart has no cartesian axes, so the xaxis/yaxis
# title-font settings the other figures use are omitted here — plotly would
# silently ignore them.
fig3.update_layout(
    font=dict(family="Verdana", size=14, color="black"),
    title=dict(font=dict(family="Verdana", size=18, color="black")),
)
fig3.show()
# Average salary per job title, restricted to titles with more than five
# postings; keep the fifteen highest-paying.
title_salary_data = (
    df_clean.groupBy("TITLE_NAME")
    .agg(
        F.mean("SALARY").alias("avg_salary"),
        F.count("SALARY").alias("count"),
    )
    .filter(F.col("count") > 5)
    .orderBy(F.desc("avg_salary"))
    .limit(15)
    .collect()
)
title_salary = pd.DataFrame(
    title_salary_data, columns=["TITLE_NAME", "avg_salary", "count"]
)

# Horizontal bar chart so long job titles stay readable.
fig4 = px.bar(
    title_salary,
    x="avg_salary",
    y="TITLE_NAME",
    orientation='h',
    title="Top 15 Job Titles by Average Salary (min 5 postings)",
    labels={"TITLE_NAME": "Job Title", "avg_salary": "Average Salary ($)"},
    color="avg_salary",
    color_continuous_scale=px.colors.sequential.Mint,
)

# Verdana styling: 14pt body/axis titles, 18pt chart title.
fig4.update_layout(
    font=dict(family="Verdana", size=14, color="black"),
    title=dict(font=dict(family="Verdana", size=18, color="black")),
    xaxis=dict(title_font=dict(family="Verdana", size=14, color="black")),
    yaxis=dict(title_font=dict(family="Verdana", size=14, color="black")),
)
fig4.show()
import os

# Export a sample of the regression predictions for downstream inspection.
predictions_data = lr_predictions.select(
    "STATE_NAME", "TITLE_NAME", "PRIMARY_SKILL",
    "SALARY", "prediction", "ABOVE_AVERAGE_SALARY"
).limit(10000).collect()
predictions_pdf = pd.DataFrame(
    predictions_data,
    columns=["STATE_NAME", "TITLE_NAME", "PRIMARY_SKILL", "SALARY", "prediction", "ABOVE_AVERAGE_SALARY"]
)

# Ensure the output directory exists — to_csv raises OSError otherwise.
os.makedirs("output", exist_ok=True)
predictions_pdf.to_csv("output/salary_predictions.csv", index=False)
print(f"Results saved to: output/salary_predictions.csv ({len(predictions_pdf):,} rows)")
Results saved to: output/salary_predictions.csv (6,132 rows)
In [13]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the full postings CSV into pandas. low_memory=False reads the file in
# one pass so a single dtype is inferred per column — this silences the
# DtypeWarning ("Columns (19,30) have mixed types") the chunked default emits.
df = pd.read_csv("data/lightcast_job_postings.csv", low_memory=False)

# Inspect the available columns.
print(df.columns)
C:\Users\jt-la\AppData\Local\Temp\ipykernel_34152\3608466472.py:5: DtypeWarning: Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
Index(['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES',
'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL',
...
'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3',
'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME',
'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6',
'NAICS_2022_6_NAME'],
dtype='object', length=131)
In [15]:
import seaborn as sns

# One skill per row. NOTE(review): this assumes df["SKILLS"] holds list-like
# values — explode() is a no-op on plain strings; confirm upstream parsing.
skills_loc = df.explode("SKILLS")

# Count occurrences of each skill within each location.
top_loc_skills = (
    skills_loc.groupby(["LOCATION", "SKILLS"])
    .size()
    .reset_index(name="Count")
)

# For the five locations with the most postings, plot their ten top skills.
top_locations = df["LOCATION"].value_counts().nlargest(5).index
for loc in top_locations:
    top10 = (
        top_loc_skills[top_loc_skills["LOCATION"] == loc]
        .nlargest(10, "Count")
    )
    plt.figure(figsize=(10, 6))
    # hue=y with legend=False keeps the per-bar palette while avoiding the
    # seaborn FutureWarning ("Passing `palette` without assigning `hue` is
    # deprecated and will be removed in v0.14.0").
    sns.barplot(y="SKILLS", x="Count", data=top10,
                hue="SKILLS", palette="mako", legend=False)
    plt.title(f"Top 10 Skills in {loc}")
    plt.show()
C:\Users\jt-la\AppData\Local\Temp\ipykernel_34152\3706716266.py:22: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
C:\Users\jt-la\AppData\Local\Temp\ipykernel_34152\3706716266.py:22: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
C:\Users\jt-la\AppData\Local\Temp\ipykernel_34152\3706716266.py:22: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
C:\Users\jt-la\AppData\Local\Temp\ipykernel_34152\3706716266.py:22: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
C:\Users\jt-la\AppData\Local\Temp\ipykernel_34152\3706716266.py:22: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
In [16]:
import plotly.graph_objects as go
from sklearn.cluster import KMeans
import numpy as np

# df_clean is a Spark DataFrame, so the original pandas-style access
# (df_clean[['STATE_INDEX', 'SALARY']].dropna(), df_clean.loc[...]) cannot
# work and this cell crashed with a Py4JJavaError (see traceback below the
# original cell). Convert just the needed columns to pandas first.
# NOTE(review): assumes STATE_INDEX and SOC exist on df_clean — the underlying
# AnalysisException suggests an unresolved column; verify the column names.
cluster_pdf = (
    df_clean.select("STATE_INDEX", "SALARY", "SOC")
    .dropna()
    .toPandas()
)
X = cluster_pdf[["STATE_INDEX", "SALARY"]]
soc_labels = cluster_pdf["SOC"]

# KMeans with 3 clusters; fixed seed + explicit n_init for reproducibility.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_

# One color per cluster.
colors = ['#3D7C6A', '#B14E53', '#297C8A']

fig = go.Figure()

# One scatter trace per cluster, with SOC/state-index/salary hover text.
for i in range(3):
    mask = clusters == i
    fig.add_trace(go.Scatter(
        x=X.loc[mask, 'STATE_INDEX'],
        y=X.loc[mask, 'SALARY'],
        mode='markers',
        name=f'Cluster {i + 1}',
        marker=dict(
            color=colors[i],
            size=8,
            opacity=0.6
        ),
        text=[f'SOC: {soc}<br>State Index: {si:.1f}<br>Salary: ${sal:,.0f}'
              for soc, si, sal in zip(soc_labels[mask],
                                      X.loc[mask, 'STATE_INDEX'],
                                      X.loc[mask, 'SALARY'])],
        hovertemplate='%{text}<extra></extra>'
    ))

# Cluster centroids as black X markers.
fig.add_trace(go.Scatter(
    x=centroids[:, 0],
    y=centroids[:, 1],
    mode='markers',
    name='Centroids',
    marker=dict(
        color='black',
        size=15,
        symbol='x',
        line=dict(width=2)
    ),
    hovertemplate='Centroid<br>State Index: %{x:.1f}<br>Salary: $%{y:,.0f}<extra></extra>'
))

# Verdana styling throughout; light plot background.
fig.update_layout(
    title=dict(
        text='KMeans Clustering: State Index vs Salary by SOC',
        font=dict(family='Verdana', size=18)
    ),
    xaxis=dict(
        title=dict(text='State Index', font=dict(family='Verdana', size=14)),
        tickfont=dict(family='Verdana', size=14)
    ),
    yaxis=dict(
        title=dict(text='Salary ($)', font=dict(family='Verdana', size=14)),
        tickfont=dict(family='Verdana', size=14)
    ),
    font=dict(family='Verdana', size=14),
    legend=dict(font=dict(family='Verdana', size=14)),
    hovermode='closest',
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white'
)

fig.show()
--------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) Cell In[16], line 6 3 import numpy as np 5 # Prepare the data ----> 6 X = df_clean[['STATE_INDEX', 'SALARY']].dropna() 7 soc_labels = df_clean.loc[X.index, 'SOC'] 9 # Perform KMeans clustering File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\dataframe.py:3085, in DataFrame.__getitem__(self, item) 3083 return self.filter(item) 3084 elif isinstance(item, (list, tuple)): -> 3085 return self.select(*item) 3086 elif isinstance(item, int): 3087 jc = self._jdf.apply(self.columns[item]) File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\dataframe.py:3229, in DataFrame.select(self, *cols) 3184 def select(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] 3185 """Projects a set of expressions and returns a new :class:`DataFrame`. 3186 3187 .. versionadded:: 1.3.0 (...) 3227 +-----+---+ 3228 """ -> 3229 jdf = self._jdf.select(self._jcols(*cols)) 3230 return DataFrame(jdf, self.sparkSession) File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\java_gateway.py:1322, in JavaMember.__call__(self, *args) 1316 command = proto.CALL_COMMAND_NAME +\ 1317 self.command_header +\ 1318 args_command +\ 1319 proto.END_COMMAND_PART 1321 answer = self.gateway_client.send_command(command) -> 1322 return_value = get_return_value( 1323 answer, self.gateway_client, self.target_id, self.name) 1325 for temp_arg in temp_args: 1326 if hasattr(temp_arg, "_detach"): File c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\errors\exceptions\captured.py:179, in capture_sql_exception.<locals>.deco(*a, **kw) 177 def deco(*a: Any, **kw: Any) -> Any: 178 try: --> 179 return f(*a, **kw) 180 except Py4JJavaError as e: 181 converted = convert_exception(e.java_exception) File 
c:\Users\jt-la\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name) 324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client) 325 if answer[1] == REFERENCE_TYPE: --> 326 raise Py4JJavaError( 327 "An error occurred while calling {0}{1}{2}.\n". 328 format(target_id, ".", name), value) 329 else: 330 raise Py4JError( 331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n". 332 format(target_id, ".", name, value)) Py4JJavaError: An error occurred while calling o82.select. : java.lang.NoSuchMethodError: 'java.lang.String org.apache.spark.SparkThrowableHelper$.getMessage$default$3()' at org.apache.spark.sql.AnalysisException.<init>(AnalysisException.scala:57) at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:54) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$7(CheckAnalysis.scala:200) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$7$adapted(CheckAnalysis.scala:193) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:367) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$6(CheckAnalysis.scala:193) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$6$adapted(CheckAnalysis.scala:193) at scala.collection.immutable.Stream.foreach(Stream.scala:533) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1(CheckAnalysis.scala:193) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1$adapted(CheckAnalysis.scala:102) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:367) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis(CheckAnalysis.scala:102) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis$(CheckAnalysis.scala:97) at 
org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:188) at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:214) at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330) at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:211) at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:76) at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:185) at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:185) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779) at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:184) at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:76) at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66) at org.apache.spark.sql.Dataset$.$anonfun$ofRows$1(Dataset.scala:90) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779) at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:88) at org.apache.spark.sql.Dataset.withPlan(Dataset.scala:3927) at org.apache.spark.sql.Dataset.select(Dataset.scala:1518) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:568) at 
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) at py4j.ClientServerConnection.run(ClientServerConnection.java:106) at java.base/java.lang.Thread.run(Thread.java:842)